In [1]:
# Import libraries
import requests
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import numpy as np
import re
import unicodedata
from sklearn.decomposition import PCA


%matplotlib inline

In part 1 (top universities) we will create functions to compute and plot the different required ratios that we will reuse in part 2.

To scrap the datas (for both website) we used postman and saw there were a json string we could scrap and parse to get the ranking. For www.topuniversities.com we had to scrap the university specific page to get some more complete informations such as the number of students.

www.topuniversities.com

The ranking is broken and contains only the top 199 universities even though the last university has rank 200, if we want to have 200 universities we should include the university ranked 201, as we want the top 200, we will take only the 199 first universities.


In [2]:
r = requests.get("https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051.txt")
json_dict = r.json()
json_dict = json_dict['data']
json_dict = json_dict[:199]

for e in json_dict:
    #Delete useless informations
    del e['nid']
    del e['logo']
    del e['core_id']
    del e['score']
    del e['cc']
    del e['stars']
    del e['guide']
    
    #Get the university specific url
    r = requests.get("https://www.topuniversities.com/" + e['url'])
    soup = BeautifulSoup(r.text, 'html.parser')
    
    #Get faculty members numbers
    scrap_list = soup.find_all('div', class_='faculty-main wrapper col-md-4')
    if(len(scrap_list) == 0): #If no informations about the faculty members is given
        e['# Faculty members'] = 0
        e['# Int. faculty members'] = 0
    else:
        n_list = scrap_list[0].find_all(class_="number")
        if(len(n_list) == 2):
            e['# Faculty members'] = int(re.sub(',', '', n_list[0].text.strip()))
            e['# Int. faculty members'] = int(re.sub(',', '', n_list[1].text.strip()))
        else: #If they don't have international faculty member
            e['# Faculty members'] = int(re.sub(',', '', n_list[0].text.strip()))
            e['# Int. faculty members'] = 0
    
    #Get students numbers
    n_list = soup.find_all('div', class_='students-main wrapper col-md-4')
    if(len(n_list) != 0):
        e['# Students'] = int(re.sub(',', '', n_list[0].find_all(class_="number")[0].text.strip()))   
    else:
        e['# Students'] = 0

    n_list = soup.find_all('div', class_='int-students-main wrapper col-md-4')
    if(len(n_list) != 0):
        e['# Int. students'] = int(re.sub(',', '', n_list[0].find_all(class_="number")[0].text.strip()))
    else:
        e['# Int. students'] = 0
    
    #We don't need the url anymore, we can delete it
    del e['url']
    
df1 = pd.DataFrame(json_dict)

df1=df1.rename(columns = {'rank_display':'Rank', 'title' : 'University', 'country' : 'Country', 'region' : 'Region'})
df1["Rank"] = df1["Rank"].apply(lambda x: int(re.sub("[^0-9]", "", x)))
df1.index = df1['University']
del df1['University']

Per university ratio

Ratio between faculty members and students

Best university according to ratio between faculty members and students. We assume that a bigger ratio is better. Even though it might not be desirable, we assume it is the case for this exercise. We plot the bar charts only for the first 30 top universites (according to the ratio) to get an idea of the trend.


In [3]:
def uni_ratio_fac_stu(df):
    #Compute ratio
    df['Ratio Faculty members'] = df['# Faculty members']/df['# Students']

    #Sort the rows according to ratio and locate the top 30 ones
    df_sorted_ratios = df.sort_values(by=['Ratio Faculty members'], ascending=False).head(30)

    #Plot the bar chart and print the best one
    fig, ax = plt.subplots(figsize=(20,10))

    plot = sns.barplot(y=df_sorted_ratios['Ratio Faculty members'], x=df_sorted_ratios.index, ax=ax)
    plot.set_xticklabels(plot.get_xticklabels(), rotation=90)
    print("Best university according to ratio between faculty members and students is : " + df_sorted_ratios.iloc[0].name)

    #Delete the ratio column of the dataframe
    del df_sorted_ratios

In [4]:
uni_ratio_fac_stu(df1)


Best university according to ratio between faculty members and students is : California Institute of Technology (Caltech)

As we can see except for the first few universites which clearly have a higher ratio than the others, the ratio tends to decrease in a linear fashion

Let's compute the best university according to ratio between local and international students. We assume that a ratio of 1 is the best, that is the same number of local and international students. Again we only plot the first 30 top ones.


In [5]:
def uni_ratio_int_stu(df):
    #Compute ratio
    df['Ratio Int. students'] = df['# Int. students']/df['# Students']

    #Sort the rows according to ratio and locate the top 30 ones
    df_sorted_ratios = df.sort_values(by=['Ratio Int. students'], ascending=False).head(30)

    #Plot the bar chart and print the best one
    fig, ax = plt.subplots(figsize=(20,10))

    plot = sns.barplot(y=df_sorted_ratios['Ratio Int. students'], x=df_sorted_ratios.index, ax=ax)
    plot.set_xticklabels(plot.get_xticklabels(), rotation=90)

    print("Best university according to ratio between local and international students is : " + df_sorted_ratios.iloc[0].name)

    #Delete the ratio column of the dataframe
    del df_sorted_ratios

In [6]:
uni_ratio_int_stu(df1)


Best university according to ratio between local and international students is : London School of Economics and Political Science (LSE)

Same observations as for the previous bar charts. It is good to note that the universities and their order are not the same as the previous ones.

Per country ratio

Ratio between faculty members and students

Let's find the best country according to the ratio between faculty members and students. Previous assumptions about what is the best ratio still hold.


In [7]:
def country_ratio_fac_stu(df):
    #Group by country and compute the ratios
    df['Ratio'] = df['# Faculty members']/df['# Students']

    #Sort the rows according to ratios
    sorted_df = df.sort_values(by=['Ratio'], ascending=False)

    #Plot the bar chart and print the best one
    fig, ax = plt.subplots(figsize=(20,10))

    plot = sns.barplot(y=sorted_df.Ratio, x=sorted_df.index, ax=ax)
    plot.set_xticklabels(plot.get_xticklabels(), rotation=50)
    print("Best country according to ratio between faculty members and students is : " + sorted_df.iloc[0].name)

    #Delete datas we don't need anymore
    del df['Ratio']
    del sorted_df

In [8]:
df1_country = df1.groupby(['Country']).agg(np.mean)

In [9]:
country_ratio_fac_stu(df1_country)


Best country according to ratio between faculty members and students is : Russia

Russia is the exception with a high number in comparison of the others. Again the rest decreases in a linear fashion.

Ratio between local and international students

Let's find the best country according to the ratio between local and international students. Previous assumptions about what is the best ratio still hold.


In [10]:
def country_ratio_int_stu(df):
    #Compute the ratios
    df['Ratio'] = df['# Int. students']/df['# Students']

    #Sort the rows according to ratios
    sorted_df = df.sort_values(by=['Ratio'], ascending=False)

    #Plot the bar chart and print the best one
    fig, ax = plt.subplots(figsize=(20,10))

    plot = sns.barplot(y=sorted_df.Ratio, x=sorted_df.index, ax=ax)
    plot.set_xticklabels(plot.get_xticklabels(), rotation=50)
    print("Best country according to ratio between local and international students is : " + sorted_df.iloc[0].name)

    #Delete datas we don't need anymore
    del df
    del sorted_df

In [11]:
country_ratio_int_stu(df1_country)


Best country according to ratio between local and international students is : Australia

This bar chart is a bit different than the previous ones. Indeed the first ratios decrease way faster than the last ones but we could still use a combintion of two linear functions to approximate the decrease.

Per region ratio

Ratio between faculty members and students

Let's find the best region according to the ratio between faculty members and students. Previous assumptions about what is the best ratio still hold.


In [12]:
def region_ratio_fac_stu(df):
    #Group by region and compute the ratios
    df['Ratio'] = df['# Faculty members']/df['# Students']

    #Sort the rows according to ratios
    sorted_df = df.sort_values(by=['Ratio'], ascending=False)

    #Plot the bar chart and print the best one
    fig, ax = plt.subplots(figsize=(20,10))

    df = df.groupby(['Region']).agg(sum)
    plot = sns.barplot(y=sorted_df.Ratio, x=sorted_df.index, ax=ax)
    plot.set_xticklabels(plot.get_xticklabels(), rotation=50)

    print("Best region according to ratio between faculty members and students is : " + sorted_df.iloc[0].name)

    #Delete datas we don't need anymore
    del df['Ratio']

In [13]:
df1_region = df1.groupby(['Region']).agg(np.mean)

In [14]:
region_ratio_fac_stu(df1_region)


Best region according to ratio between faculty members and students is : Asia

Ratio between local and international students

Let's find the best region according to the ratio between local and international students. Previous assumptions about what is the best ratio still hold.


In [15]:
def region_ratio_int_stu(df):
    #Compute the ratios
    df['Ratio'] = df['# Int. students']/df['# Students']

    #Sort the rows according to ratios
    sorted_df = df.sort_values(by=['Ratio'], ascending=False)

    #Plot the bar chart and print the best one
    fig, ax = plt.subplots(figsize=(20,10))

    plot = sns.barplot(y=sorted_df.Ratio, x=sorted_df.index, ax=ax)
    plot.set_xticklabels(plot.get_xticklabels(), rotation=50)

    print("Best region according to ratio between local and international students is : " + sorted_df.iloc[0].name)

    #Delete datas we don't need anymore
    del df
    del sorted_df

In [16]:
region_ratio_int_stu(df1_region)


Best region according to ratio between local and international students is : Oceania

Oceania is clearly the leader here.

www.timeshighereducation.com

Scraping

Using Postman when browsing the page with the ranking, I saw that a request for a json file was made. It actually contains all the information we need in a single file.

Helper to get the region way more easily


In [17]:
countries = dict(df1[['Country', 'Region']].drop_duplicates().values.tolist())
countries['Luxembourg'] = 'Europe'

URL containing the requested data in json


In [18]:
timeshighereducation_url = "https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json"

ranking_brute = requests.get(timeshighereducation_url).json()['data']

In [19]:
infos_to_keep = ["rank", "name", "location", "stats_number_students", "stats_pc_intl_students", "stats_student_staff_ratio"]
column_names =  ["Rank", "University", "Country", "# Students", "% Int. students", "% Faculty members"]

In [20]:
# creating dataframe
df2 = pd.DataFrame(ranking_brute[:200], index=range(1, 201), columns=infos_to_keep)

# more expressive column names
df2.columns = column_names

df2["Rank"] = df2["Rank"].apply(lambda x: int(re.sub("[^0-9]", "", x)))

# string to numerical values
df2["# Students"] = pd.to_numeric(df2["# Students"].map(lambda x: x.replace(",", "")))
df2["% Int. students"] = pd.to_numeric(df2["% Int. students"].map(lambda x: x.replace("%", ".")))
df2["% Faculty members"] = pd.to_numeric(df2["% Faculty members"])

# adding number of international students
df2["# Int. students"] = (df2["# Students"] * df2["% Int. students"] / 100).map(round)

# adding number of faculty members
df2["# Faculty members"] = (df2["# Students"] / df2["% Faculty members"]).map(round)

# Changing Russian federation to Russia
df2['Country'].replace('Russian Federation', 'Russia', inplace=True)

# adding regions
df2["Region"] = df2["Country"].map(lambda c: countries[c])

# Usign meaningful index
df2.index = df2['University']
del df2['University']

# Deleting % Int. students because it is not needed anymore
del df2['% Int. students']
del df2["% Faculty members"]

Per university ratio

Ratio between faculty members and students

Best university according to ratio between faculty members and students. We assume that a bigger ratio is better. Even though it might not be desirable, we assume it is the case for this exercise. Again we plots only the first 30 ones.


In [21]:
uni_ratio_fac_stu(df2)


Best university according to ratio between faculty members and students is : Vanderbilt University

One university is clearly above the others, then it decreases in a linear fashion.

Ratio between local and international students

Best university according to ratio between local and international students. We assume that a ratio of 1 is the best, that is the same number of local and international students.


In [22]:
uni_ratio_int_stu(df2)


Best university according to ratio between local and international students is : London School of Economics and Political Science

We can see that LSE school has 70% of international students which is really impressive.

Per country ratio

Ratio between faculty members and students

Let's find the best country according to 1) ratio between faculty members and students and 2) ratio between local and international students. Previous assumptions about what is the best ratio still hold.


In [23]:
df2_country = df2.groupby(['Country']).agg(np.mean)

In [24]:
country_ratio_fac_stu(df2_country)


Best country according to ratio between faculty members and students is : Denmark

In Denmark there are around 6 students for a staff member. It's clearly above the average.

Ratio between local and international students


In [25]:
country_ratio_int_stu(df2_country)


Best country according to ratio between local and international students is : Luxembourg

English talking countries are more international (in term of international students ratio) in general and also small european countries like Luxembourg (really impressive how high), Austria and Switzerland.

Let's find the best region according to 1) ratio between faculty members and students and 2) ratio between local and international students. Previous assumptions about what is the best ratio still hold.

Per region ratio

Ratio between faculty members and students


In [26]:
df2_region = df2.groupby(['Region']).agg(np.mean)

In [27]:
region_ratio_fac_stu(df2_region)


Best region according to ratio between faculty members and students is : Africa

It's quite suprising to see Africa first. We wouldn't have expect that at first. Although Europe is quite at the back.

Ratio between local and international students


In [28]:
region_ratio_int_stu(df2_region)


Best region according to ratio between local and international students is : Oceania

Quick observations on both ranking

  • Oceania followed by Europe have the biggest ratio between local and internaionl students in both ranking
  • Latin america universities are not present in the top 200 of times higher education
  • Russia tends to have a lot of faculty members, it is at the top (not necessarily first place) in both bar charts.
  • Countries ratios are different between the two rankings which is expected as the universities are not the same, but we can see some countries around the same position in the bar charts of both ranking. This might show that in some cases the universities of a specific country tend to have more or less the same ratio of faculty members or international students. It could come from many factors, such as law enforcment, the culture or the international reputation of a country.
  • More observations are made below.

Combining the two websites

To combine the two rankings, we will be using the name of the universities. However, the names are rarely exactly the same in both rankings, which means that we need to use regex to keep the important part of the name only.
Sometimes regex will not be enough, and we will need to change the name in one of the dataframe manually.

We start by working on the cases where regex is not enough.


In [29]:
df2.rename({'ETH Zurich – Swiss Federal Institute of Technology Zurich' : 'ETH Zurich – Swiss Federal Institute of Technology',
           'Wageningen University & Research' : 'Wageningen University'}, inplace=True)
df1.rename({'UCL (University College London)': 'University College London',
           'KAIST - Korea Advanced Institute of Science & Technology' : 'Korea Advanced Institute of Science and Technology (KAIST)',
           'Ludwig-Maximilians-Universität München' : 'LMU Munich',
           'Ruprecht-Karls-Universität Heidelberg' : 'Heidelberg University',
           'University of North Carolina, Chapel Hill' : 'University of North Carolina at Chapel Hill',
           'Trinity College Dublin, The University of Dublin' : 'Trinity College Dublin',
           'KIT, Karlsruhe Institute of Technology' : 'Karlsruhe Institute of Technology',
           'Humboldt-Universität zu Berlin' : 'Humboldt University of Berlin',
           'Freie Universitaet Berlin': 'Free University of Berlin',
           'Université de Montréal' : 'University of Montreal',
           'Université Pierre et Marie Curie (UPMC)' : 'Pierre and Marie Curie University',
           'Technische Universität Berlin (TU Berlin)' : 'Technical University of Berlin',
           'Universitat Autònoma de Barcelona' : 'Autonomous University of Barcelona',
           'Eberhard Karls Universität Tübingen' : 'University of Tübingen',
           'Albert-Ludwigs-Universitaet Freiburg' : 'University of Freiburg',
           'Scuola Superiore Sant\'Anna Pisa di Studi Universitari e di Perfezionamento': 'Scuola Superiore Sant’Anna'}, inplace=True)

And then we use regex on every university name.


In [30]:
regex1 = r'\([^()]*\)'
regex2 = r"(the)|(university)|(of)|(de)|(-)|( )|(’)|(')|\."
p = re.compile('('+regex1+'|'+regex2+')')

for df in [df1, df2]:
    df['University_regex'] = df.index.values
    # We first take care of the case
    df['University_regex'] = df['University_regex'].apply(lambda x: x.casefold())
    # Then we remove useless words, spaces, parentheses, their content and so on
    df['University_regex'] = df['University_regex'].apply(lambda x: str(re.sub(p, '', x)))
    # We then normalize the text to remove accents
    df['University_regex'] = df['University_regex'].apply(lambda x: unicodedata.normalize("NFKD", x))
    # And swith the encoding to utf-8
    df['University_regex'] = df['University_regex'].apply(lambda x: x.rstrip().encode('ascii', errors='ignore')
                                                                     .decode('utf-8'))

Once we know that the data is ready to be merged, we apply an inner merge. We chose to use an inner merge instead of an outer merge because it makes more sense to only keep universities present in both rankings.


In [31]:
df_merged = df2.reset_index().merge(df1, how='inner', on='University_regex').set_index('University')

Once the merge is done, we select and clean the data we are interested in. Moreover, since the number of student is different depending on the ranking, we decided to keep the average.


In [32]:
#Rename columns and put them in a new dataframe
df_all = df_merged.copy()
df_all['# Students (topuni)'] = df_merged['# Students_x']
df_all['# Students (times)'] = df_merged['# Students_y']
df_all['# Int. students (topuni)'] = df_merged['# Int. students_x']
df_all['# Int. students (times)'] = df_merged['# Int. students_y']
df_all['# Faculty members (topuni)'] = df_merged['# Faculty members_x']
df_all['# Faculty members (times)'] = df_merged['# Faculty members_y']
df_all['Rank topuni'] = df_merged['Rank_x']
df_all['Rank times'] = df_merged['Rank_y']
df_all['Country'] = df_merged['Country_x']
df_all['Region'] = df_merged['Region_x']
df_all['Ratio Int. students (topuni)'] = df_merged['Ratio Int. students_x']
df_all['Ratio Int. students (times)'] = df_merged['Ratio Int. students_y']
df_all['Ratio Faculty members (topuni)'] = df_merged['Ratio Faculty members_x']
df_all['Ratio Faculty members (times)'] = df_merged['Ratio Faculty members_y']

#Columns to keep
df_all = df_all[['# Students (topuni)', '# Students (times)',
                 '# Int. students (topuni)', '# Int. students (times)', '# Faculty members (topuni)',
                 '# Faculty members (times)', 'Rank topuni', 'Rank times', 'Country', 'Region',
                 'Ratio Int. students (topuni)', 'Ratio Int. students (times)', 'Ratio Faculty members (topuni)', 'Ratio Faculty members (times)']]
del df_merged

Plot of the correlation


In [33]:
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(df_all.corr(), ax=ax, square=True)


Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x11dfde0f0>

Above, a few things are interesting to notice:

  • When comparing the same corresponding data for each website, we can clearly distinguish a strong correlation which is something we can expect.
  • There is a negative correlation between the rankings and every other feature (except the # Students) which is logical since the rankings are the only caracteristics where lower is better.
  • There is a higher (absolute) correlation between the ranking of TopUniversity and the number of students than for Times Higher Education. We could guess from that that TopUniversity's criterias for the ranking give more importance to the size of the university than Times Higher Education's criterias.
  • There is an (absolute) correlation between the ratios and the rankings (higher ratios would have better ranks on average).
  • The number of faculty members is correlated with the number of students. Which is logic because it is not desirable to have a university with a lot of faculty members but very few students and inversly.
  • It is good to note that both rankings seem not to always agree on the number of students, faculty members and international students as the correlation is not of 1.

Note that by 'absolute' correlation we mean the absolute value of the correlation

Best university

Basic method

To find the best university according to both ranking, we could use the following method: We take the mean of both ranking and the university with the smallest mean is the best university. With this method we try to find the university closest to the top in both rankings.


In [34]:
df_all['Rank mean'] = (df_all['Rank times'] + df_all['Rank topuni'])/2.0
best = df_all.sort_values(by=['Rank mean'], ascending=True).iloc[0]

print("The best university according to both ranking is " + best.name + " with a mean rank score of {}".format(best["Rank mean"]))

del df_all['Rank mean']


The best university according to both ranking is Stanford University with a mean rank score of 2.5

Alternative method

What we could do, is extract the principal components of the data (A PCA, basically) and project the points on the main component.


In [35]:
pca = PCA(n_components=2)
# Creates pairs of ranks for each university
ranks = np.array([df_all[['Rank times']].values.flatten(), df_all[['Rank topuni']].values.flatten()]).T
# Tells the PCA what the data is and then get the data in its new basis
ranks_new_basis = pca.fit(ranks).transform(ranks)
# Get the ratio of importance for each component
pca_score = pca.explained_variance_ratio_
# Get the components
V = pca.components_
# Multiplies the components by their ratio of importance compared to the least important component
ranks_pca_axis_x, ranks_pca_axis_y = V.T * pca_score / pca_score.min()

We then plotted the different Universities according to their respective ranking and showed the first two main component of the data (green and red).


In [36]:
starting_point = (100, 100)

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111)
plt.title('Universities ranking')
plt.plot(df_all[['Rank times']].values.flatten(), df_all[['Rank topuni']].values.flatten(), '.')
plt.plot([starting_point[0], starting_point[0] + ranks_pca_axis_x[0]*20], [starting_point[1], starting_point[1] + ranks_pca_axis_y[0]*20], label='Main component')
plt.plot([starting_point[0], starting_point[0] + ranks_pca_axis_x[1]*20], [starting_point[1], starting_point[1] + ranks_pca_axis_y[1]*20], label='Second component')
plt.xlabel('Ranking according to Times')
plt.ylabel('Ranking according to Topuni')
plt.legend()
plt.show()


Once we have the principle component, we can use it to transform our data and change its basis from (rank times, rank topuni) to (main component, second component)


In [37]:
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111)
plt.scatter(ranks_new_basis.T[0], ranks_new_basis.T[1])

# Creates main component arrow
plt.plot([-125, 150], [0, 0], c='g', label='Main component')
plt.plot([140, 150], [4, 0], c='g')
plt.plot([140, 150], [-4, 0], c='g')

plt.xlabel('Main component')
plt.ylabel('Second component')
plt.title('Universities ranking in new basis')
plt.legend()
plt.show()


To get a final ranking system, we simply project the points on the main component axis and rate them depending on how far left they are. The more on the left, the better the rank.


In [38]:
# Keep only main component value of data
ranks_pca1 = ranks_new_basis.T[0]
# Get indices of sorted data
temp = ranks_pca1.argsort()
# Creates new array and then aranges values depending on indices of sorted data
new_ranks = np.empty(len(ranks_pca1), int)
new_ranks[temp] = np.arange(len(ranks_pca1))

We now add this new rank in the DataFrame and show the 20 first universities in our new ranking system.


In [39]:
df_all['Rank total'] = new_ranks + 1

Best university according to both ranking :


In [40]:
print("Best university according to PCA is : " + df_all.sort_values(by='Rank total').iloc[0].name)


Best university according to PCA is : Stanford University

As we can see with both the basic and alternative method Stanford University is the best according to both ranking. Which is not surpising considering its excellent reputation.